package crawler;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;

import java.net.InetSocketAddress;
import java.net.Proxy;

/**
 * A breadth-first news crawler built on WebCollector's BreadthCrawler.
 * It seeds the crawl, prints every link discovered during parsing, and
 * exposes a hook for routing requests through an HTTP proxy.
 *
 * Created by User on 2017/12/14.
 */
public class NewsCrawler extends BreadthCrawler {

    public static void main(String[] args) throws Exception {
        // "crawl" is the directory used to persist crawl state (Berkeley DB,
        // per the berkeley plugin import); true enables autoParse, i.e.
        // automatic link extraction.
        NewsCrawler crawl = new NewsCrawler("crawl", true);

        // Crawl to a depth of 1, i.e. fetch only the seed pages.
        crawl.start(1);
    }

    public NewsCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        // Start from a single seed URL.
        this.addSeed("https://www.liaoxuefeng.com/");

        // ".*" places no restriction on which discovered URLs are followed.
        this.addRegex(".*");
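        // An illustrative, narrower alternative (not part of the original)
        // that would keep the crawl on the seed's own site:
        // this.addRegex("https://www\\.liaoxuefeng\\.com/.*");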

    }

    /**
     * Called once for each successfully fetched page; page-level
     * extraction goes here.
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
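        // A minimal sketch of page-level extraction, illustrative only: the
        // <title> selector is an assumption about the target pages, and
        // page.select(...) is WebCollector's Jsoup-backed CSS selector.
        org.jsoup.nodes.Element titleEl = page.select("title").first();
        if (titleEl != null) {
            System.out.println(page.url() + "\t" + titleEl.text());
        }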

    }

    @Override
    protected void afterParse(Page page, CrawlDatums next) {
        // Runs after link extraction; "next" holds the URLs queued for the
        // next crawl round. Print each one so the frontier is visible.
        for (CrawlDatum crawlDatum : next) {
            System.out.println(crawlDatum.url());
        }
    }

    /**
     * Fetch hook: override this to customize how HTTP requests are made,
     * for example to route them through a proxy.
     */
    @Override
    public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
        // Proxy setup: to send requests through an HTTP proxy, construct the
        // request with a java.net.Proxy instead, e.g.:
        // HttpRequest request = new HttpRequest(crawlDatum,
        //         new Proxy(Proxy.Type.HTTP, new InetSocketAddress("hostname", 1111)));
        HttpRequest request = new HttpRequest(crawlDatum);
        return request.response();
    }
}
